We have a set of air passenger data from 2015 to 2017 and, for future flights, we want to be able to predict the number of passengers who will fly based on past passenger data, flight conditions, and other variables that may influence it.
For this, the following is requested:
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import re
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
import statsmodels.tsa.stattools as sts
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from sklearn.metrics import mean_absolute_error, mean_squared_error
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot
from plotly.subplots import make_subplots
import plotly
warnings.filterwarnings("ignore")
df = pd.read_csv('data_passenger.csv')
df.shape
(790, 9)
df.columns
Index(['date', 'month', 'holiday', 'num_passengers', 'event_intensity',
'rain_intensity', 'traffic_occupancy', 'week_of_month', 'day_of_week'],
dtype='object')
df.head()
| date | month | holiday | num_passengers | event_intensity | rain_intensity | traffic_occupancy | week_of_month | day_of_week | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-01 | 1 | 1 | 1125 | 0 | 0.0 | 0.0 | 0 | 3 |
| 1 | 2015-01-02 | 1 | 0 | 3592 | 0 | 0.0 | 0.0 | 0 | 4 |
| 2 | 2015-01-03 | 1 | 0 | 3001 | 0 | 0.0 | 0.0 | 0 | 5 |
| 3 | 2015-01-04 | 1 | 0 | 2260 | 0 | 0.0 | 0.0 | 0 | 6 |
| 4 | 2015-01-05 | 1 | 0 | 2767 | 0 | 0.0 | 0.0 | 0 | 0 |
df.isna().sum()
date 0 month 0 holiday 0 num_passengers 0 event_intensity 0 rain_intensity 0 traffic_occupancy 0 week_of_month 0 day_of_week 0 dtype: int64
We observe that we do not have null values in any of the variables of our DataFrame.
df.dtypes
date object month int64 holiday int64 num_passengers int64 event_intensity int64 rain_intensity float64 traffic_occupancy float64 week_of_month int64 day_of_week int64 dtype: object
We observe that in our data set we may be missing some important variables such as:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
Taking into account that the seasons of the year are usually important for many people when making decisions about whether to travel or not — that is, this could be an exogenous variable of great importance for our model — we are going to add a new variable to our dataframe with each period of the year, called seasons, which will be:
def classify_seasons(df:pd.DataFrame, years:list):
    """
    DESCRIPTION:
        Creates and fills a 'seasons' column, classifying each row by season
        (summer, autumn, winter and spring) based on its date.
        Season boundaries follow the Southern Hemisphere convention, where
        summer wraps the year boundary (Dec 21 of one year to Mar 20 of the next).
    PARAMETERS:
        df (DataFrame): Data set with the time series; must contain a datetime 'date' column.
        years (list): list of years (ints) to classify. Include the year before the
            first year in the data so the wrapping summer period is covered.
    RETURN:
        df (DataFrame): the data set passed by parameter, with the added "seasons" column.
    """
    # Season boundaries as (start 'MM-DD', end 'MM-DD'). Summer is the only
    # period whose end date falls in the following calendar year.
    season_bounds = {
        'summer': ('12-21', '03-20'),
        'autumn': ('03-21', '06-20'),
        'winter': ('06-21', '09-20'),
        'spring': ('09-21', '12-20'),
    }
    for year in years:
        for season, (start, end) in season_bounds.items():
            # Summer ends in the next calendar year; every other season ends in the same year.
            end_year = year + 1 if season == 'summer' else year
            # String comparison against a datetime64 column is coerced by pandas.
            mask = (df['date'] >= f'{year}-{start}') & (df['date'] <= f'{end_year}-{end}')
            df.loc[mask, 'seasons'] = season
    return df
years = [2014, 2015, 2016, 2017, 2018]
df = classify_seasons(df, years)
df.seasons.value_counts()
seasons summer 240 autumn 184 winter 184 spring 182 Name: count, dtype: int64
def graphics_transparent(fig:plotly.graph_objs._figure.Figure, bg_transparent:bool, x_title:str, y_title:str, width:int, height:int, rotate:int):
    """
    DESCRIPTION:
        Styles a plotly figure with either a transparent background (white text)
        or a white background (black text), depending on bg_transparent.
    PARAMETERS:
        fig (plotly.graph_objs._figure.Figure): figure to style.
        bg_transparent (bool): if True the background is transparent and fonts are white;
            otherwise the background is left as-is (white) and fonts are black.
        x_title (str): x-axis title.
        y_title (str): y-axis title.
        width (int): width of the chart.
        height (int): height of the chart.
        rotate (int): rotation angle of the x-axis tick labels.
    RETURN:
        fig (plotly.graph_objs._figure.Figure): plot with the specified characteristics.
    """
    # The two modes differ only in font color; everything else is shared.
    font_color = 'white' if bg_transparent else 'black'
    fig.update_layout(
        xaxis = dict(
            title = "<i>"+x_title+"</i>",
            tickangle = -90,  # overridden by xaxis_tickangle=rotate below
            titlefont = dict(
                family = 'Raleway, monospace',
                size = 22,
                color = font_color),
            tickfont = dict(
                family = 'Arial',
                size = 15,
                color = font_color),
            type='category',
            gridcolor = '#807d89'),
        yaxis= dict(
            title = "<i>"+y_title+"</i>",
            titlefont = dict(
                family = 'Raleway, monospace',
                size = 22,
                color = font_color),
            tickfont = dict(
                family = 'Arial',
                size = 15,
                color = font_color),
            gridcolor = '#807d89'),
        legend=dict(
            font=dict(
                family = 'Raleway, monospace',
                size = 16,
                color = font_color)),
        barmode='stack',
        width=width,
        height=height
    )
    fig.update_traces(textfont_color=font_color)
    if bg_transparent:
        # Fully transparent plot and paper backgrounds.
        fig.layout.plot_bgcolor = 'rgba(0,0,0,0)'
        fig.layout.paper_bgcolor = 'rgba(0,0,0,0)'
    fig.update_layout(xaxis_tickangle=rotate)
    return fig
def graphics_scatter(df:pd.DataFrame, column1:str, column2:str, x_title:str, y_title:str, colors:str, width:int, height:int, bg_transparent:bool, rotate:int):
    """
    DESCRIPTION:
        Makes a timeline graph. One line per unique value of `column1` is drawn,
        plotting the monthly sum of `column2` for that value. If the data cannot be
        grouped by month (no 'month' column) or there are not enough colors for the
        unique values, it falls back to a single line of `column2` vs `column1`.
    PARAMETERS:
        df (DataFrame): data set.
        column1 (str): grouping variable (e.g. 'year'); one trace per unique value.
        column2 (str): target variable, summed per month within each group.
        x_title (str): x-axis title.
        y_title (str): y-axis title.
        colors (list): one color per unique value of column1 (or at least one for the fallback).
        width (int): width of the chart.
        height (int): height of the chart.
        bg_transparent (bool): if True the background is transparent, otherwise white.
        rotate (int): rotation angle of the x-axis tick labels.
    RETURN:
        fig (plotly.graph_objs._figure.Figure): plot with the specified characteristics.
    """
    values = list(df[column1].unique())
    month_names = {
        1: 'January',
        2: 'February',
        3: 'March',
        4: 'April',
        5: 'May',
        6: 'June',
        7: 'July',
        8: 'August',
        9: 'September',
        10: 'October',
        11: 'November',
        12: 'December'
    }
    try:
        traces = []
        for i, value in enumerate(values):
            monthly = df[df[column1] == value][['month', column2]].groupby('month').sum(column2).reset_index()
            # Only relabel the month column, to avoid touching column2 values.
            monthly['month'] = monthly['month'].replace(month_names)
            traces.append(go.Scatter(name='<i>'+str(value)+'</i>', x=monthly['month'].astype(str),
                                     y=monthly[column2], mode='lines+ markers',
                                     marker_color=colors[i], text=monthly[column2], marker_line_color=colors[i]))
        fig = go.Figure(data=traces)
    except (KeyError, IndexError):
        # KeyError: no 'month' column to group by (e.g. column1 is a date index).
        # IndexError: fewer colors than unique values. Fall back to a single raw trace.
        fig = go.Figure(data=[
            go.Scatter(x=df[column1].astype(str),
                       y=df[column2], mode='lines+ markers',
                       marker_color=colors[0], text=df[column2], marker_line_color=colors[0]),
        ])
    fig = graphics_transparent(fig, bg_transparent, x_title, y_title, width, height, rotate)
    return fig
def graphics_bar(df:pd.DataFrame, column1:str, column2:str, x_title:str, y_title:str, colors:str, width:int, height:int, bg_transparent:bool, rotate:int, typ:str):
    """
    DESCRIPTION:
        Creates a bar graph of the sum of `column2` grouped by `column1`.
    PARAMETERS:
        df (DataFrame): data set.
        column1 (str): variable by which the data will be grouped.
        column2 (str): target variable. A groupby is made on "column1" and a sum on "column2".
        x_title (str): x-axis title.
        y_title (str): y-axis title.
        colors (str): color of the graph.
        width (int): width of the chart.
        height (int): height of the chart.
        bg_transparent (bool): defines whether the background of the graph will be transparent or white.
            If True the background is transparent; otherwise it is white.
        rotate (int): rotation angle of the x-axis tick labels.
        typ (str): labelling mode. 'week' maps day-of-week numbers (0=Monday) to day names;
            'holiday' maps the 0/1 flag to 'Non-holiday'/'holiday'; any other value leaves
            the group labels untouched.
    RETURN:
        fig (plotly.graph_objs._figure.Figure): plot with the specified characteristics.
    """
    # Group and sum the target variable.
    df_groupby = df[[column1, column2]].groupby(column1).sum(column2).reset_index()
    if typ == 'week':
        days_of_week = {
            0: "Monday",
            1: "Tuesday",
            2: "Wednesday",
            3: "Thursday",
            4: "Friday",
            5: "Saturday",
            6: "Sunday"
        }
        df_groupby[column1] = df_groupby[column1].replace(days_of_week)
    elif typ == 'holiday':
        # 1 marks a holiday in the source data (e.g. 2015-01-01 has holiday=1),
        # so map 1 -> 'holiday' and 0 -> 'Non-holiday'. Only relabel the group
        # column so the summed values are never accidentally replaced.
        df_groupby[column1] = df_groupby[column1].replace({0: 'Non-holiday', 1: 'holiday'})
    # Bar chart.
    fig = go.Figure(data=[
        go.Bar(x=df_groupby[column1], y=df_groupby[column2], marker_color=colors,text=df_groupby[column2],textposition='auto',marker_line_color=colors),
    ])
    # Apply the shared figure styling.
    fig = graphics_transparent(fig, bg_transparent, x_title, y_title, width, height, rotate)
    return fig
Let's visualize the evolution of passengers as a function of time.
df.columns
Index(['date', 'month', 'holiday', 'num_passengers', 'event_intensity',
'rain_intensity', 'traffic_occupancy', 'week_of_month', 'day_of_week',
'year', 'seasons'],
dtype='object')
graphics_scatter(
df = df,
column1 = 'year',
column2 = 'num_passengers',
x_title = 'Months of the year',
y_title = 'Number of passengers',
colors = ['red', 'yellow', 'orange'],
width = 1200,
height = 600,
bg_transparent = False,
rotate = -45
)
It can be seen that the graphs for the years 2015 and 2016 present a similar behavior, that is, the time series has seasonality.
Now let's see graphically how the exogenous variables of our time series behave.
graphics_bar(
df = df,
column1 = 'day_of_week',
column2 = 'num_passengers',
x_title = 'Day of the week',
y_title = 'Number of passengers',
colors='blue',
width = 1000,
height = 600,
bg_transparent = False,
rotate = -45,
typ = 'week'
)
In the graph above you can see that there does not seem to be much difference in the number of passengers between Monday and Friday, with Wednesday being the weekday with the highest number of passengers. In addition, you can see that Saturday and Sunday are the days when fewer people travel.
In the graph you can see that people usually travel, mostly, between January and February, while the month where there was the least number of passengers was in the month of August.
df.columns
Index(['date', 'month', 'holiday', 'num_passengers', 'event_intensity',
'rain_intensity', 'traffic_occupancy', 'week_of_month', 'day_of_week',
'year', 'seasons'],
dtype='object')
graphics_bar(
df = df,
column1 = 'seasons',
column2 = 'num_passengers',
x_title = 'Seasons of the year',
y_title = 'Number of passengers',
colors='orange',
width = 1000,
height = 600,
bg_transparent = False,
rotate = -45,
typ = 'seasons'
)
Above you can see that people, for the most part, tend to travel in summer, with a total of 776,881 passengers in this season, while the season of the year where there were fewer passengers was in winter, which indicates that people usually prefer to stay at home in winter.
# Bar chart of total passengers on holidays vs non-holidays.
graphics_bar(
    df = df,
    column1 = 'holiday',
    column2 = 'num_passengers',
    x_title = 'Holiday / Non-Holiday',
    y_title = 'Number of passengers',
    colors='green',
    width = 1000,
    height = 600,
    bg_transparent = False,
    rotate = -45,
    typ = 'holiday'
)
# Bar chart of total passengers per year. 'year' is not a special labelling
# mode, so the group labels are used as-is (the previous typ='holiday' was a
# copy-paste leftover; it had no visible effect on year labels but was misleading).
graphics_bar(
    df = df,
    column1 = 'year',
    column2 = 'num_passengers',
    x_title = 'Years of study',
    y_title = 'Number of passengers',
    colors='#52874F',
    width = 1000,
    height = 600,
    bg_transparent = False,
    rotate = -45,
    typ = 'year'
)
The graph above shows that the total number of passengers during 2015 was 1,100,435, while in 2016 there was an increase of 32,967 passengers, which could indicate that for the year 2017 there may also be an increase in the number of passengers.
Before using the Dickey-Fuller test, let's decompose our time series.
Previously, when we looked at the time series graph of our data set, we observed that the variation in the amplitude of the time series appears to be constant as the series progresses, so ideally we would use an additive model. However, we will use both models and see which model our time series performs better with.
def descomposition_time_series(df:pd.DataFrame, column_date:str, column_value:str, model:str = 'additive', all_model:bool = True, merge:bool = False):
    '''
    DESCRIPTION:
        Generates the decomposition of a time series (trend, seasonal and residual)
        and returns the components, optionally merged back into the input data set.
    PARAMETERS:
        df (DataFrame): data set containing the time series to be decomposed.
        column_date (String): name of the column that contains the dates of the time series.
        column_value (String): name of the column that contains the values of the time series.
        model (String): name of the model used to decompose the time series
            ('additive' or 'multiplicative'). Ignored when all_model is True.
            By default the value of model is "additive".
        all_model (Boolean): if True, both models are used and columns are created with
            the trend, seasonal and residual values, suffixed with the model used.
        merge (Boolean): if True, the decomposition columns are merged into the
            data set passed by parameter and that data set is returned.
    RETURN:
        df_descomposition or df (DataFrame): a dataframe with the decomposition
            variables "trend_<model>", "seasonal_<model>" and "residual_<model>",
            where <model> is the model used for the decomposition.
    '''
    # If the dates are not already the index, index the frame by them so the
    # decomposition output is aligned on dates.
    if df.index.name != column_date:
        df = df.set_index(column_date)
    # Frame that will collect the decomposition components.
    df_descomposition = pd.DataFrame()
    # Decompose with both models, or only the one passed by parameter.
    models = ['additive', 'multiplicative'] if all_model else [model]
    for current_model in models:
        decomposition = seasonal_decompose(x=df[column_value], model=current_model)
        df_descomposition['trend_' + current_model] = decomposition.trend
        df_descomposition['seasonal_' + current_model] = decomposition.seasonal
        df_descomposition['residual_' + current_model] = decomposition.resid
    # If merge is True, attach the components to the df passed by parameter.
    if merge:
        return df.merge(df_descomposition.reset_index(), on=column_date)
    return df_descomposition
df_descomposition = descomposition_time_series(df=df, column_date='date', column_value='num_passengers')
# df = descomposition_temporal_serie(df=df, column_date='fecha', column_value='npasajeros', merge=True)
df_descomposition.head()
| trend_additive | seasonal_additive | residual_additive | trend_multiplicative | seasonal_multiplicative | residual_multiplicative | |
|---|---|---|---|---|---|---|
| date | ||||||
| 2015-01-01 | NaN | 455.438229 | NaN | NaN | 1.146928 | NaN |
| 2015-01-02 | NaN | 312.146137 | NaN | NaN | 1.097037 | NaN |
| 2015-01-03 | NaN | -841.657434 | NaN | NaN | 0.724106 | NaN |
| 2015-01-04 | 2564.857143 | -1366.438047 | 1061.580904 | 2564.857143 | 0.557270 | 1.581173 |
| 2015-01-05 | 2961.857143 | 349.443331 | -544.300474 | 2961.857143 | 1.113554 | 0.838946 |
We observe that we already have a dataframe with the values of the decomposition of our time series, now let's graph.
models = ['additive', 'multiplicative']
colors = ['red', 'yellow']
n = 0
for model in models:
print(f'*******************************************Graficando serie de tendencia con el modelo {model}*******************************************')
fig = graphics_scatter(
df = df_descomposition.reset_index(),
column1 = 'date',
column2 = 'trend_'+model,
x_title = 'Date',
y_title = 'Trend',
colors = [colors[n]],
width = 1500,
height = 600,
bg_transparent = True,
rotate = 280
)
fig.show()
n += 1
*******************************************Graficando serie de tendencia con el modelo additive*******************************************
*******************************************Graficando serie de tendencia con el modelo multiplicative*******************************************
In the graphs above we can clearly see that the trend of our time series is linear, since it increases and decreases constantly over time.
models = ['additive', 'multiplicative']
colors = ['red', 'yellow']
n = 0
for model in models:
print(f'*******************************************Graficando serie de estacionalidad con el modelo {model}*******************************************')
fig = graphics_scatter(
df = df_descomposition.reset_index(),
column1 = 'date',
column2 = 'seasonal_'+model,
x_title = 'Date',
y_title = 'Seasonality',
colors = [colors[n]],
width = 1500,
height = 600,
bg_transparent = True,
rotate = 280
)
fig.show()
n += 1
*******************************************Graficando serie de estacionalidad con el modelo additive*******************************************
*******************************************Graficando serie de estacionalidad con el modelo multiplicative*******************************************
Above we observe how the time series behaves in periods of the month, observing that the seasonality is constant, that is, it shows similar patterns in the same periods of each month. We can determine that the seasonality period of our time series can be between 13 and 14.
models = ['additive', 'multiplicative']
colors = ['red', 'yellow']
n = 0
for model in models:
print(f'*******************************************Graficando serie de residuos con el modelo {model}*******************************************')
fig = graphics_scatter(
df = df_descomposition.reset_index(),
column1 = 'date',
column2 = 'residual_'+model,
x_title = 'Date',
y_title = 'Residue',
colors = [colors[n]],
width = 1500,
height = 600,
bg_transparent = True,
rotate = 280
)
fig.show()
n += 1
*******************************************Graficando serie de residuos con el modelo additive*******************************************
*******************************************Graficando serie de residuos con el modelo multiplicative*******************************************
In the graph above we can see that the residuals are randomly dispersed around zero (in the graph with the additive model), which indicates that the decomposition model is capable of capturing most of the variability in the time series.
The Dickey-Fuller test is a statistical test used to determine whether a time series has a unit root or not. A unit root means that there is a trend in the data and that past values have a significant impact on future values.
To determine from the DF test whether our series is stationary or not, even though there is no specific p value that determines whether a series is stationary or not, we will use the significance value that is usually used which is 0.05
# Augmented Dickey-Fuller test on the raw passenger series.
adf_result = sts.adfuller(df['num_passengers'])
print(f'ADF Statistical: {adf_result[0]:f}')
print(f'p-value: {adf_result[1]:f}')
print('Critical values:')
for level, threshold in adf_result[4].items():
    print(f'\t{level}: {threshold:.3f}')
ADF Statistical: -2.542154 p-value: 0.105552 Critical values: 1%: -3.439 5%: -2.865 10%: -2.569
Above we can see that the value of p is greater than 0.05, so we should not reject the null hypothesis, this suggests that the time series in question is not suitable for its use in time series models that assume stationarity. Therefore we will have to use some techniques to make our time series stationary.
A time series is considered stationary if its statistical behavior does not change over time. That is, the mean, variance, and autocovariance of the time series are constant over time. To make a time series stationary, it is necessary to remove trends and seasonal patterns from the series.
There are several techniques to make a time series stationary. such as differentiation, decomposition and Box-Cox transformation. Differentiation involves subtracting the current value of the time series from the previous value:
$$\Delta y_t = y_t - y_{t-1}$$where $y_t$ is the value of the time series at time $t$, and $y_{t-1}$ is the value of the time series at time $t-1$. The operation $\Delta$ is read as "delta" and refers to the difference.
In pandas we have a function (pandas.core.series.Series.diff()) that allows us to apply differentiation on a data set.
# First-difference the series, then re-run the Augmented Dickey-Fuller test.
df['diff'] = df['num_passengers'].diff()
adf_result = sts.adfuller(df['diff'].dropna())
print(f'ADF Statistical: {adf_result[0]:f}')
print('p-value: ' + str(adf_result[1]))
print('Critical values:')
for level, threshold in adf_result[4].items():
    print(f'\t{level}: {threshold:.3f}')
ADF Statistical: -7.734929 p-value: 1.1001087665778645e-11 Critical values: 1%: -3.439 5%: -2.865 10%: -2.569
We observe that with the differentiation the **p** value took a value less than 0.05, which means that now our series is stationary.
plot_acf(df['diff'].dropna(),zero=False)
plt.xlim(0,20)
plt.xticks(np.arange(0,20,1));
In the graph above we can see that the first correlation coefficients are below the shaded band, which can indicate that the time series is negatively correlated in the first delays, this means that as one value of the series increases the next value decreases and vice versa, that is, there is an inverse relationship between the values of the series at different moments in time.
Now we are going to make a partial autocorrelation plot (PACF), the difference between the autocorrelation plot (ACF) and the PACF lies in how the correlation between two values of the time series is calculated. In the ACF plot each autocorrelation coefficient is calculated as a function of the previous coefficients, while in the PACF plot each coefficient is calculated after having eliminated the influence of all intermediate values.
plot_pacf(df['diff'].dropna(),zero=False,lags=40,method='ols',alpha=0.05)
plt.xticks(np.arange(0,40,2))
plt.show()
In the graph above we can visually observe that our time series is stationary and the values are negatively correlated.
To build the time series model we will use the ARIMA model (Autoregressive Integrated Moving Average), it is a statistical model used to analyze and predict time series. It is made up of three components: the autoregressive model (AR), the moving average model (MA) and the integrated differentiation (I). The general form of the ARIMA model can be expressed as ARIMA(p,d,q), where:
The general formula of the ARIMA model can be expressed as:
$$\phi_p(L)(1-L)^d y_t = \theta_q(L) \epsilon_t$$where $y_t$ is the value of the time series at time $t$, $\epsilon_t$ is the error term at time $t$, $L$ is the lag operator, $\phi_p(L)$ is the autoregressive polynomial of order $p$, $\theta_q(L)$ is the moving average polynomial of order $q$, and $(1-L)^d$ is the integrated differentiation operator of order $d$ .
The autoregressive polynomial of order $p$ can be expressed as:
$$\phi_p(L) = 1 - \phi_1 L - \phi_2 L^2 - \cdots - \phi_p L^p$$where $\phi_1, \phi_2, \cdots, \phi_p$ are the autoregressive coefficients.
The moving average polynomial of order $q$ can be expressed as:
$$\theta_q(L) = 1 + \theta_1 L + \theta_2 L^2 + \cdots + \theta_q L^q$$where $\theta_1, \theta_2, \cdots, \theta_q$ are the moving average coefficients.
The integrated differentiation operation can be expressed as:
$$(1-L)^d y_t = \sum _{i=0}^{d} \binom{d}{i} (-1)^i y_{t-i}$$where $\binom{d}{i}$ is the binomial coefficient used to calculate the combinations of $i$ elements of a set of $d$ elements.
To do this, we divide the dataset into 700 samples for train and 90 for test.¶
df.shape
(790, 12)
train = df.set_index('date')["num_passengers"].iloc[:700]
test = df.set_index('date')["num_passengers"].iloc[700:]
print(train.shape, test.shape)
(700,) (90,)
To build the ARIMA model we must take into account the values we need, as we mentioned before these are:
The parameter p (autoregressive order). In our autocorrelation graph (ACF) we can see that we have the first significant delay in the first, so we know that the value of this parameter is p = 1.
The parameter d (order of integration). It refers to the number of times the series must be differenced for it to become stationary. In our case we differenced once to make the time series stationary; therefore, we must tell the model that d = 1.
The q parameter (Moving Average Order). This value can be estimated from the partial autocorrelation plot (PACF) and the first significant lag value on the plot is taken. In our PACF graph we can see that the first significant lagged value is the first since it protrudes minimally from the shaded band, therefore, when we go to create the model ARIMA we must tell it that q = 1
Having the values of p, d and q are:
p = 1
d = 1
q = 1
We can now create the ARIMA model.
# We define the model variables: p (AR order, from the ACF plot),
# d (differencing order, one difference made the series stationary),
# q (MA order, from the PACF plot).
p = 1
d = 1
q = 1
# We create the model on the 700-sample training series.
model = ARIMA(train, order=(p,d,q))
# We train the model
model_fit = model.fit()
Once the model is trained, we make predictions and calculate the corresponding metrics to measure the model
# Make predictions
predictions = model_fit.forecast(steps=len(test), alpha=0.05) # 95% conf
# We calculate metrics to measure the model
mse = mean_squared_error(test, predictions)
rmse = np.sqrt(mse)
mae = mean_absolute_error(test, predictions)
mape = np.mean(np.abs((test - predictions) / test)) * 100
accuracy = round(100 - mape, 2)
# We print the metrics of our model
print(f'''
The model with the best results:
- order (p:{p}, d:{d}, q:{q})
- RMSE: {rmse}
- MAE: {mae}
- MAPE: {round(mape, 2)}%
- Accuracy: {round(100 - mape, 2)}%
''')
The model with the best results:
- order (p:1, d:1, q:1)
- RMSE: 949.4568135233638
- MAE: 706.1399573261943
- MAPE: 34.04%
- Accuracy: 65.96%
Above we observe that the root mean squared error (RMSE) of the model's predictions is 949.45 units, which indicates that our model is not fitting the data well and there is a significant amount of error in the predictions. Likewise, the values of MAE (mean absolute error) and MAPE (mean absolute percentage error) are high, which confirms that our model is not fitting the data well. Let's graphically compare the predictions against the actual data:
def graphics_preddictions(train:pd.core.series.Series, test:pd.core.series.Series, predictions:pd.core.series.Series, x_title:str, y_title:str, width:int, height:int, bg_transparent:bool, colors:list, all_data:bool, rotate:int):
    """
    DESCRIPTION:
        Graphs the time series divided into three parts (training data, test or
        validation data, and predictions).
    PARAMETERS:
        train (pandas.core.series.Series): Training data set.
        test (pandas.core.series.Series): Validation data set.
        predictions (pandas.core.series.Series): Predictions dataset.
        x_title (str): x-axis title.
        y_title (str): y-axis title.
        width (int): width of the graph (pixels).
        height (int): height of the graph (pixels).
        bg_transparent (bool): Defines whether the graph will be transparent or not.
            If True the graph is transparent; if False it has a white background.
        colors (list): list of colors for each data set (train, test and predictions).
        all_data (bool): Defines whether the graph includes the training data set.
            If True the training series is included; if False only the validation
            series and predictions are graphed.
        rotate (int): rotation angle of the x-axis tick labels.
    RETURN:
        fig (plotly.graph_objs._figure.Figure): Figure with the specified characteristics.
    """
    def _trace(name, series, color):
        # One styled line+marker trace for a series, indexed by its date index.
        return go.Scatter(name='<i>'+name+'</i>', x=series.index.astype(str),
                          y=series, mode='lines+ markers',
                          marker_color=color, text=series, marker_line_color=color)
    traces = []
    if all_data:
        traces.append(_trace('Training data', train, colors[0]))
    traces.append(_trace('Validation data', test, colors[1]))
    traces.append(_trace('Predictions', predictions, colors[2]))
    fig = go.Figure(data=traces)
    fig = graphics_transparent(fig, bg_transparent, x_title, y_title, width, height, rotate)
    return fig
fig = graphics_preddictions(
train=train,
test=test,
predictions=predictions,
x_title='Date',
y_title='Number of passengers',
width=1400,
height=700,
bg_transparent=True,
colors=['red', 'blue', 'yellow'],
all_data = False,
rotate = -45
)
fig.show()
By comparing the prediction series (yellow line) with the validation series (blue line) we can observe and visually determine that our model is not fitting the data well and there is a significant amount of error in the predictions, so we should use techniques to improve the performance of our model.
Exogenous variables are independent variables that can affect the variable of interest, in this case, they can affect the variable npassengers. The importance of exogenous variables in the SARIMAX model lies in their ability to improve the precision and predictive capacity of the model. Now, in our data set we have more than 10 variables and we must select the most appropriate exogenous variables for our SARIMAX model. To do this, we will do a correlation analysis and, also, we will do a principal components analysis.
For this analysis we will use the Pearson correlation
df = pd.get_dummies(df)
df.head()
| date | month | holiday | num_passengers | event_intensity | rain_intensity | traffic_occupancy | week_of_month | day_of_week | year | diff | seasons_autumn | seasons_spring | seasons_summer | seasons_winter | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-01 | 1 | 1 | 1125 | 0 | 0.0 | 0.0 | 0 | 3 | 2015 | NaN | False | False | True | False |
| 1 | 2015-01-02 | 1 | 0 | 3592 | 0 | 0.0 | 0.0 | 0 | 4 | 2015 | 2467.0 | False | False | True | False |
| 2 | 2015-01-03 | 1 | 0 | 3001 | 0 | 0.0 | 0.0 | 0 | 5 | 2015 | -591.0 | False | False | True | False |
| 3 | 2015-01-04 | 1 | 0 | 2260 | 0 | 0.0 | 0.0 | 0 | 6 | 2015 | -741.0 | False | False | True | False |
| 4 | 2015-01-05 | 1 | 0 | 2767 | 0 | 0.0 | 0.0 | 0 | 0 | 2015 | 507.0 | False | False | True | False |
# Map the boolean dummy columns to integers so they take part in the
# numeric correlation computation below.
df = df.replace({True: 1, False: 0})
import seaborn as sns
import matplotlib.pyplot as plt
# Keep only the numeric columns of the DataFrame
numeric_df = df.select_dtypes(include=[float, int])
# Pearson correlation of every numeric column with 'num_passengers',
# shaped into a one-column DataFrame sorted from strongest to weakest.
corr = (
    numeric_df.corrwith(df['num_passengers'])
    .rename('Correlations')
    .rename_axis('Index')
    .to_frame()
    .sort_values(by='Correlations', ascending=False)
)
plt.figure(figsize=(4, 8))  # tall, narrow figure fits the single column of values
fig = sns.heatmap(corr, annot=True, fmt=".2g", cmap='YlGnBu')
plt.title("Correlation of Variables with Number of Passengers")
plt.show()
In the graph above we can see the correlation of each variable with our target variable (num_passengers), observing that there is no strong correlation between them. However, it is observed that the traffic_occupancy variable has a moderate correlation. Despite this, we will create a function that will receive each of the exogenous variables in our data set and will return the model with the best results.
def get_best_model(df:pd.DataFrame, exogenous_variable:str, p:int, d:int, q:int, s:int, return_model:bool = False):
    """
    DESCRIPTION:
        Trains a SARIMAX model on the first 700 observations using the given exogenous variable,
        predicts the remaining observations and evaluates the predictions against the actual
        passenger counts.
    PARAMETERS:
        df (DataFrame): data set with the time series.
        exogenous_variable (str): exogenous variable with which the SARIMAX model will be trained.
        p (int): order of the autoregressive term (also used for the seasonal AR term).
        d (int): order of differentiation (also used for the seasonal part).
        q (int): order of the moving average term (also used for the seasonal MA term).
        s (int): length of the seasonal cycle.
        return_model (bool): By default this parameter is False and the evaluation metrics are
            returned. If True, only the trained model is returned.
    RETURN:
        If return_model is False:
            rmse (float): deviation between the predicted and actual values (root of the average squared error).
            mae (float): deviation between the predicted and actual values (average magnitude of absolute errors).
            mape (float): average percentage of the deviation between the predicted values and the actual values.
            accuracy (float): model precision, computed as 100 - mape.
        else:
            model_fit (statsmodels.tsa.statespace.sarimax.SARIMAXResultsWrapper): model trained with
            the exogenous variable passed by parameter.
    """
    import statsmodels.api as sm
    # specify endogenous and exogenous training series (first 700 rows)
    endog_train = df.set_index('date')['num_passengers'][:700]
    exog_train = df.set_index('date')[exogenous_variable][:700]
    # Adjust SARIMAX model; the same (p, d, q) orders are reused for the seasonal part.
    model = sm.tsa.SARIMAX(endog=endog_train, exog=exog_train, order=(p, d, q), seasonal_order=(p, d, q, s))
    model_fit = model.fit(disp=False)
    # Make out-of-sample predictions for the remaining rows
    exog_test = df.set_index('date')[exogenous_variable][700:]
    # Actual passenger counts for the hold-out period, used to evaluate the predictions.
    endog_test = df.set_index('date')['num_passengers'][700:]
    predictions = model_fit.predict(exog=exog_test, start=exog_test.index[0], end=exog_test.index[-1])
    # BUG FIX: the metrics must compare the ACTUAL passenger counts (endog_test)
    # with the predictions. The original code computed the MSE against the
    # exogenous series (exog_test) and computed MAE/MAPE against a global
    # variable `test` defined outside this function.
    mse = mean_squared_error(endog_test, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(endog_test, predictions)
    mape = np.mean(np.abs((endog_test - predictions) / endog_test)) * 100
    accuracy = round(100 - mape, 2)
    if return_model:
        return model_fit
    else:
        return rmse, mae, mape, accuracy
def create_model_arima(df:pd.DataFrame, exogenous_variables:list, p_min:int, p_max:int, d:int, q_min:int, q_max:int, seasonal_s:int, max_accuracy:float, graphics:bool):
    '''
    DESCRIPTION:
        Trains SARIMAX models with different values of p and q (the two ranges are zipped, so p and q
        advance in lockstep and only combinations with p == q are tried) and with each candidate
        exogenous variable, in order to find the model with the highest accuracy.
    PARAMETERS:
        df (DataFrame): data set with the time series and the candidate exogenous variables.
        exogenous_variables (list): column names to try, one at a time, as the exogenous regressor.
        p_min (int): minimum value of p.
        p_max (int): maximum value of p (exclusive).
        d (int): number of times that had to be differentiated for the series to be stationary.
        q_min (int): minimum value of q.
        q_max (int): maximum value of q (exclusive).
        seasonal_s (int): seasonality period of the time series.
        max_accuracy (float): desired accuracy. When the best accuracy reached so far is equal to or
            greater than max_accuracy, the search stops early.
        graphics (bool): if this value is true, a MAPE/accuracy curve is plotted after each p/q iteration.
    RETURN:
        df_results (DataFrame): RMSE, MAE, MAPE, accuracy, orders and exogenous variable of each trained model.
        model_fit (SARIMAXResultsWrapper): model re-trained with the parameters of the lowest-MAPE entry.
    '''
    # Accumulators for the metrics and settings of every model trained during the search.
    list_rmse = list()
    list_mae = list()
    list_mape = list()
    p_d_q_list = list()
    seasonal_ = list()
    list_accuracy = list()
    list_exo_variables = list()
    # zip() advances p and q together, so the grid is (p_min, q_min), (p_min+1, q_min+1), ...
    for p, q in zip(range(p_min, p_max), range(q_min, q_max)):
        print(f'***********************************Processing model with p values:{p}, d:{d}, q:{q}***********************************')
        # Try every candidate exogenous variable with the current (p, d, q).
        for exogenous_variable in exogenous_variables:
            rmse, mae, mape, accuracy = get_best_model(df, exogenous_variable, p, d, q, seasonal_s)
            # We save each of the metrics in their corresponding lists
            list_rmse.append(rmse)
            list_mae.append(mae)
            list_mape.append(mape)
            p_d_q_list.append(f'p:{p}, d:{d}, q:{q}')
            seasonal_.append(f'p:{p}, d:{d}, q:{q}, s:{seasonal_s}')
            list_accuracy.append(accuracy)
            list_exo_variables.append(exogenous_variable)
            print(f'''
            Accuracy with s:{seasonal_s} and the variable: {exogenous_variable}:
            - seasonal_order (p:{p}, d:{d}, q:{q}, s:{seasonal_s})
            - Accuracy: {accuracy}%
            ''')
        # Early stop: checked once per p/q iteration, against the best accuracy seen so far.
        if max(list_accuracy) >= max_accuracy:
            print(f'The desired accuracy has been reached ({max(list_accuracy)}), The process will stop and the model with said accuracy will be returned.')
            break
        else:
            pass
        if graphics:
            # Training curve over all models trained so far (one point per model).
            plt.figure(figsize=(10, 3), dpi=100)
            plt.plot(list_mape, label='Mape')
            plt.plot(list_accuracy, label='Accuracy',color='r')
            plt.title('training curve: MAPE y ACCURACY')
            plt.legend(loc='best', fontsize=10)
            plt.show()
        else:
            pass
    # We create a DataFrame with the results of each model
    df_results = pd.DataFrame({'rmse':list_rmse, 'mae':list_mae, 'mape': list_mape, 'accuracy':list_accuracy,
                               'order (p, d y q)':p_d_q_list, 'seasonal_order (p, d, q y s)':seasonal_, 'exo_variable':list_exo_variables})
    # We obtain the best values of p, d, q and s (lowest MAPE) to re-create our model.
    p_d_q_s = re.findall(r'\d+', df_results.nsmallest(10, 'mape').reset_index(drop=True)['seasonal_order (p, d, q y s)'][0])
    p, d, q, s = int(p_d_q_s[0]), int(p_d_q_s[1]), int(p_d_q_s[2]), int(p_d_q_s[3])
    exogenous_variable = df_results.nsmallest(10, 'mape').reset_index(drop=True)['exo_variable'][0]
    # NOTE(review): the RMSE/MAE/MAPE printed below are the minima over ALL trained
    # models and may therefore come from different models than the one selected.
    print(f'''
    *********************************************************************************************************
    The model with the best results:
    - Exogenous variable: {exogenous_variable}
    - order (p:{p}, d:{d}, q:{q})
    - seasonal_order (p:{p}, d:{d}, q:{q}, s:{s})
    - RMSE: {min(list_rmse)}
    - MAE: {min(list_mae)}
    - MAPE: {round(min(list_mape), 2)}%
    - Accuracy: {100 - round(min(list_mape), 2)}%
    *********************************************************************************************************
    ''')
    # We create a new model with the best values of p, d, q and s.
    model_fit = get_best_model(df, exogenous_variable, p, d, q, s, return_model=True)
    return df_results, model_fit
df.columns
Index(['date', 'month', 'holiday', 'num_passengers', 'event_intensity',
'rain_intensity', 'traffic_occupancy', 'week_of_month', 'day_of_week',
'year', 'diff', 'seasons_autumn', 'seasons_spring', 'seasons_summer',
'seasons_winter'],
dtype='object')
# Candidate exogenous regressors: every column except the target
# ('num_passengers'), 'date', and the engineered 'diff' column.
exogenous_variables =['month', 'holiday', 'event_intensity',
'rain_intensity', 'traffic_occupancy', 'week_of_month', 'day_of_week',
'year', 'seasons_autumn', 'seasons_spring', 'seasons_summer',
'seasons_winter']
Once the function is created, we call it, expecting it to return the model with the best performance.
# We run the create_model_arima function.
# p and q are searched in lockstep from 0 to 10 (p_max/q_max are exclusive);
# the search stops early as soon as a model reaches 85% accuracy.
df_results, model_fit = create_model_arima(df=df,
                                           exogenous_variables= exogenous_variables,
                                           p_min=0,
                                           p_max=11,
                                           d=1,
                                           q_min=0,
                                           q_max=11,
                                           seasonal_s=14,
                                           max_accuracy=85.0,
                                           graphics=False)
***********************************Processing model with p values:0, d:1, q:0***********************************
Accuracy with s:14 and the variable: month:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 73.39%
Accuracy with s:14 and the variable: holiday:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 75.82%
Accuracy with s:14 and the variable: event_intensity:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 72.18%
Accuracy with s:14 and the variable: rain_intensity:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 70.7%
Accuracy with s:14 and the variable: traffic_occupancy:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 83.71%
Accuracy with s:14 and the variable: week_of_month:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 75.74%
Accuracy with s:14 and the variable: day_of_week:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 72.18%
Accuracy with s:14 and the variable: year:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 75.42%
Accuracy with s:14 and the variable: seasons_autumn:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 72.18%
Accuracy with s:14 and the variable: seasons_spring:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 72.34%
Accuracy with s:14 and the variable: seasons_summer:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 59.08%
Accuracy with s:14 and the variable: seasons_winter:
- seasonal_order (p:0, d:1, q:0, s:14)
- Accuracy: 72.18%
***********************************Processing model with p values:1, d:1, q:1***********************************
Accuracy with s:14 and the variable: month:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 81.41%
Accuracy with s:14 and the variable: holiday:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 85.28%
Accuracy with s:14 and the variable: event_intensity:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 80.7%
Accuracy with s:14 and the variable: rain_intensity:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 79.05%
Accuracy with s:14 and the variable: traffic_occupancy:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 84.23%
Accuracy with s:14 and the variable: week_of_month:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 81.44%
Accuracy with s:14 and the variable: day_of_week:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 80.7%
Accuracy with s:14 and the variable: year:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 75.58%
Accuracy with s:14 and the variable: seasons_autumn:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 80.65%
Accuracy with s:14 and the variable: seasons_spring:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 80.94%
Accuracy with s:14 and the variable: seasons_summer:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 66.56%
Accuracy with s:14 and the variable: seasons_winter:
- seasonal_order (p:1, d:1, q:1, s:14)
- Accuracy: 80.81%
The desired accuracy has been reached (85.28), The process will stop and the model with said accuracy will be returned.
*********************************************************************************************************
The model with the best results:
- Exogenous variable: holiday
- order (p:1, d:1, q:1)
- seasonal_order (p:1, d:1, q:1, s:14)
- RMSE: 2000.0937746145842
- MAE: 346.0444711679569
- MAPE: 14.72%
- Accuracy: 85.28%
*********************************************************************************************************
# Observe the results of each trained model: the ten best by accuracy.
df_results.nlargest(10, 'accuracy')
| rmse | mae | mape | accuracy | order (p, d y q) | seasonal_order (p, d, q y s) | exo_variable | |
|---|---|---|---|---|---|---|---|
| 13 | 3553.228998 | 346.044471 | 14.723981 | 85.28 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | holiday |
| 16 | 3433.268553 | 354.194848 | 15.765093 | 84.23 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | traffic_occupancy |
| 4 | 3230.926393 | 395.551844 | 16.286826 | 83.71 | p:0, d:1, q:0 | p:0, d:1, q:0, s:14 | traffic_occupancy |
| 17 | 3599.814584 | 416.995129 | 18.564744 | 81.44 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | week_of_month |
| 12 | 3601.331155 | 416.449450 | 18.594256 | 81.41 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | month |
| 21 | 3621.125302 | 429.251807 | 19.063522 | 80.94 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | seasons_spring |
| 23 | 3624.645553 | 432.301190 | 19.186023 | 80.81 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | seasons_winter |
| 14 | 3628.989933 | 435.185221 | 19.295999 | 80.70 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | event_intensity |
| 18 | 3626.374854 | 435.193734 | 19.296143 | 80.70 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | day_of_week |
| 20 | 3631.074975 | 436.626077 | 19.347444 | 80.65 | p:1, d:1, q:1 | p:1, d:1, q:1, s:14 | seasons_autumn |
# We make predictions with the best model returned by the function
# (SARIMAX(1,1,1)x(1,1,1,14) with 'holiday' as the exogenous variable).
exog_test = df.set_index('date')['holiday'][700:]
predictions = model_fit.predict(exog=exog_test, start=exog_test.index[0], end=exog_test.index[-1])
# We graph the training series, the validation series and the predictions.
endog_train = df.set_index('date')['num_passengers'][:700]
endog_test = df.set_index('date')['num_passengers'][700:]
fig = graphics_preddictions(
    train=endog_train,
    test=endog_test,
    predictions=predictions,
    x_title='Date',
    y_title='Number of passengers',  # fixed typo: was 'Numero of passengers'
    width=1400,
    height=700,
    bg_transparent=True,
    colors=['red', 'blue', 'yellow'],
    all_data = False,
    rotate = -45
)
fig.show()
In conclusion, the best model was obtained with order p = 1, d = 1, q = 1 and seasonal period s = 14, reaching an accuracy of 85.28%, and the last validation graph shows how well the predicted values fit the actual values.